knitr::opts_chunk$set(
message = FALSE,
warning = TRUE,
include = TRUE,
error = TRUE
)
if (!require(groundhog)) install.packages("groundhog")
groundhog::groundhog.library(c("tidyverse", "faux", "furrr"),
date = "2022-06-14") # older date to accommodate older R version on the R cluster
options(digits = 4, scipen = 999)
theme_set(theme_bw(base_size = 12))
Scenarios = expand_grid(
# norm sample size
N_norm = c(50, 100, 250, 500, 1000),
# local sample size
N = c(25, 100, 175, 250, 500),
# population correlation
roh = c(.1, .3, .5, .7),
# degree of sample selection: none, above average, half an sd above average, an sd above average (stanine scale)
selection = c(-999, 5, 6, 7),
# Simulate each scenario 1000 times
Simulation = 1:1000)
generate_data_and_calculate_r <- function(N_norm, N, roh, selection,
Simulation) {
# Store scenario's parameters
Parameters <- environment() %>% as.list() %>% as_tibble()
# function for calculating disattenuated r
r_disattenuated <- function(x, y, sd_x) {
r_obs <- cor(x,y)
u_x <- sd(x)/sd_x
r_obs/(u_x*sqrt(((1/u_x^2)-1)*r_obs^2+1))
}
# function for norm standardising then calculating regression slope
b_normed <- function(x, y, sd_x, sd_y) {
N <- length(x)
normed_x <- x/sd_x
normed_y <- y/sd_y
(N * sum(normed_x * normed_y) - sum(normed_x) * sum(normed_y)) / (N * sum(normed_x^2) - sum(normed_x)^2)
}
# Generate Dataset
# N draws from a bivariate normal distribution with means = 0, SDs = 1 and correlations as above
x_y <- rnorm_multi(n = 10000, vars = 2, mu = c(5, 5), sd = c(2, 2), r = roh, empirical = TRUE)
x_y_norm_sample <- x_y %>%
# sample from the overall sample with N_norm as defined above
sample_n(N_norm)
# extract x and y along with their lengths to make sure it is equal to N
x_norm <- x_y_norm_sample$X1
y_norm <- x_y_norm_sample$X2
x_y_local_sample <- x_y %>%
# censor the sample, selecting on X to different degrees as defined above
filter(X1 > selection) %>%
# sample from the censored samples with Ns as defined above
sample_n(N)
x_local_sample <- x_y_local_sample$X1
y_local_sample <- x_y_local_sample$X2
# Our three statistical models predict well-being
results <- tibble(
sd_x = sd(x_local_sample),
sd_y = sd(y_local_sample),
# calculate correlations
r_raw = cor(x_local_sample, y_local_sample),
r_disattenuated = r_disattenuated(x_local_sample, y_local_sample, sd(x_norm)),
r_norm_standardised = b_normed(x_local_sample, y_local_sample, sd(x_norm), sd(y_norm,))
) %>%
bind_cols(Parameters, .) %>%
mutate(selection = case_when(
selection == 5 ~ "above average",
selection == 6 ~ "half an sd above average",
selection == 7 ~ "an sd above average",
TRUE ~ "none"
))
}
# sim_results = Scenarios %>%
# pmap(generate_data_and_calculate_r, .progress = T) %>%
# # Combine everything into a data frame
# bind_rows()
# We ran this once
plan(multisession)
sim_results <- Scenarios %>%
furrr::future_pmap(generate_data_and_calculate_r, .progress = T,
.options = furrr::furrr_options(seed = 14)) %>%
# Combine everything into a data frame
bind_rows()
write_rds(sim_results, "sim_results.rds")
sim_results <- readRDS("sim_results.rds")
sim_results_long <- sim_results %>%
pivot_longer(starts_with("r_"), names_to = "r_estimator", values_to = "r_estimate") %>%
mutate(selection = factor(selection, levels =c ("none", "above average", "half an sd above average", "an sd above average")),
bias = r_estimate - roh)
sim_results_long %>%
ggplot(aes(x = as.factor(N), y = bias, group = r_estimator, colour = r_estimator)) +
geom_line(stat = "summary") +
geom_pointrange(stat = "summary", fatten = 1) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 9)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "Bias") +
facet_grid(rows = vars(selection), cols = vars(roh))
sim_results_long %>%
ggplot(aes(x = as.factor(N), y = bias, group = r_estimator, colour = r_estimator)) +
geom_line(stat = "summary") +
geom_pointrange(stat = "summary", fatten = 1) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 9)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "Bias") +
facet_grid(rows = vars(selection), cols = vars(N_norm))
sim_results_long %>%
ggplot(aes(x = as.factor(N), y = bias, group = r_estimator, colour = r_estimator)) +
geom_line(stat = "summary") +
geom_pointrange(stat = "summary", fatten = 1) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 9)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "Bias") +
facet_grid(cols = vars(N_norm))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
ggplot(aes(x = as.factor(N), y = bias, group = r_estimator, colour = r_estimator)) +
geom_line(stat = "summary") +
geom_pointrange(stat = "summary", fatten = 1) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 9)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "Bias") +
facet_grid(rows = vars(selection), cols = vars(roh))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
ggplot(aes(x = as.factor(N), y = bias, group = r_estimator, colour = r_estimator)) +
geom_line(stat = "summary") +
geom_pointrange(stat = "summary", fatten = 1) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 9)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "Bias") +
facet_grid(rows = vars(selection), cols = vars(N_norm))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
ggplot(aes(x = as.factor(N), y = bias, group = r_estimator, colour = r_estimator)) +
geom_line(stat = "summary") +
geom_pointrange(stat = "summary", fatten = 1) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 9)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "Bias") +
facet_grid(cols = vars(N_norm))
sim_results_long %>%
group_by(N, roh, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = se, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "SE") +
facet_grid(rows = vars(selection), cols = vars(roh))
sim_results_long %>%
group_by(N, N_norm, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = se, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "SE") +
facet_grid(rows = vars(selection), cols = vars(N_norm))
sim_results_long %>%
group_by(N, N_norm, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
arrange(desc(abs(bias))) %>%
group_by() %>%
ggplot(aes(x = as.factor(N), y = se, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "SE") +
facet_grid(cols = vars(N_norm))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
group_by(N, roh, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = se, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "SE") +
facet_grid(rows = vars(selection), cols = vars(roh))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
group_by(N, N_norm, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = se, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "SE") +
facet_grid(rows = vars(selection), cols = vars(N_norm))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
group_by(N, N_norm, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = se, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "SE") +
facet_grid(cols = vars(N_norm))
sim_results_long %>%
group_by(N, roh, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = MSE, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "MSE") +
facet_grid(rows = vars(selection), cols = vars(roh))
sim_results_long %>%
group_by(N, N_norm, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = MSE, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "MSE") +
facet_grid(rows = vars(selection), cols = vars(N_norm))
sim_results_long %>%
group_by(N, N_norm, r_estimator) %>%
summarise(
bias = mean(r_estimate - roh),
MSE = mean((r_estimate - roh)^2),
se = sqrt(sum(r_estimate - mean(r_estimate))^2/(n()-1))
) %>%
arrange(desc(abs(bias))) %>%
group_by() %>%
ggplot(aes(x = as.factor(N), y = MSE, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2", "#e69f00")) +
labs(x = "N", y = "MSE") +
facet_grid(cols = vars(N_norm))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
group_by(N, roh, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = MSE, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "MSE") +
facet_grid(rows = vars(selection), cols = vars(roh))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
group_by(N, N_norm, selection, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = MSE, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "MSE") +
facet_grid(rows = vars(selection), cols = vars(N_norm))
sim_results_long %>%
filter(r_estimator != "r_raw") %>%
group_by(N, N_norm, r_estimator) %>%
summarise(
bias = mean(bias),
se = sqrt(sum((r_estimate - mean(r_estimate))^2) / (n() - 1)),
MSE = mean((r_estimate - roh)^2)) %>%
ggplot(aes(x = as.factor(N), y = MSE, group = r_estimator, colour = r_estimator)) +
geom_line() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.3),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12)
) +
geom_line(y = 0, colour = "black") +
scale_colour_manual(values = c("#009e73", "#0072b2")) +
labs(x = "N", y = "MSE") +
facet_grid(cols = vars(N_norm))